

### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below or see 'exec_time.csv')
# Returns the last-modified timestamp of the document currently active in RStudio (i.e. this script when run from the editor)
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime


### Script purpose:  Combine 4 raw payroll files into a single file, with only minimal changes to standardize column names and filter
### the selected date range


### Execution time: ~50 minutes

### Inputs: 
# 1) /data/raw_[format1]/country_codigo.[format1]
# 2) /data/raw_[format1]/country_contrata.[format1]
# 3) /data/raw_[format1]/country_honorarios.[format1]
# 4) /data/raw_[format1]/country_planta.[format1]


### Outputs:
# 1) /data/raw_[format1]/country_combined.[format1]



#
# SET-UP ----------------------------------------------------------------------------------------------------------------------------------------------------
#

### Source the '00_country_global.R' script (located in the same folder as this script) with required packages and functions
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# Make an archive copy of this script: 'code' in its path becomes 'code/00_ARCHIVE' and the file name gets a ' - copy' suffix.
# The extension pattern is anchored ('\\.R$') so that only the trailing '.R' is rewritten, never a '.R' that happens
# to occur earlier in the path (e.g. in a user or folder name).
# NOTE(review): 'code' is still replaced wherever it occurs in the path - confirm no parent folder name contains 'code'.
file.copy(rstudioapi::getSourceEditorContext()$path,
          gsub('code', 'code/00_ARCHIVE', gsub('\\.R$', ' - copy.R', rstudioapi::getSourceEditorContext()$path)),
          overwrite = TRUE, copy.date = TRUE)



### >> NOTES --------------------------------------------------------------------------------------------------------------------------------------
#  planta and contrata should have all the same columns
#  reading .parquet around 4x faster than fread()


# '  -----------------------------------------------------------------------------------------------------------------------------------------------
# READ FILES --------------------------------------------------------------------------------------------------------------------------------------
#

# Start the clock (elapsed time is printed once the checks below are done)
t0 <- Sys.time()

# Each raw file is read with read_flex(), tagged with its origin in a leading 'dataset' column and
# converted to a data.table; gc() after every read frees memory before the next large file is loaded.

## planta
planta <- read_flex(
  file.path(main_dir, 'data', paste0('raw_', format1), 'country_planta'),
  format = format1
) %>%
  add_column(dataset = 'planta', .before = 1) %>%
  setDT()
gc()

## contrata
contrata <- read_flex(
  file.path(main_dir, 'data', paste0('raw_', format1), 'country_contrata'),
  format = format1
) %>%
  add_column(dataset = 'contrata', .before = 1) %>%
  setDT()
gc()

## honorarios
honorarios <- read_flex(
  file.path(main_dir, 'data', paste0('raw_', format1), 'country_honorarios'),
  format = format1
) %>%
  add_column(dataset = 'honorarios', .before = 1) %>%
  setDT()
gc()

## codigo
codigo <- read_flex(
  file.path(main_dir, 'data', paste0('raw_', format1), 'country_codigo'),
  format = format1
) %>%
  add_column(dataset = 'codigo', .before = 1) %>%
  setDT()
gc()


# (*)checks -------------------------------------------------------------------------------------------------------------------------------------------
### compare column names (each pair is checked in both directions, so columns unique to either file show up)

# planta vs honorarios
sort(setdiff(names(planta), names(honorarios)))
sort(setdiff(names(honorarios), names(planta)))

# codigo vs honorarios
sort(setdiff(names(codigo), names(honorarios)))
sort(setdiff(names(honorarios), names(codigo))) # compare against codigo, not honorarios against itself (always empty)


# is the severance-amount column present in planta?
'monto_desvin' %in% names(planta)

# inspect the 'desc_otrpago' column of planta
planta$desc_otrpago

### (*)checks -> honorarios
# Interactive sanity checks on the honorarios-specific payment columns; nothing below modifies the data.

## type of pay
# pr_na() is a project helper defined outside this script - presumably tabulates shares including NAs (confirm);
# the result observed when the check was written is recorded in the trailing comment
pr_na(honorarios$tipo_pago) # R: 89.2% monthly, 7.7% missing, other types only remaining 3.1%
# summary() of gross pay (divided by 1000) within each payment type
tapply(honorarios$remuneracionbruta/1000, honorarios$tipo_pago, summary) # pago distribution not drastically different
## across payment types, although pago mensual at somewhat lower median and noticeably higher mean (latter likely due to 
## high outlier values)

## number of cuotas
# histogram of num_cuotas restricted to rows whose payment type is 'Pago en cuotas'
honorarios$num_cuotas[honorarios$tipo_pago == 'Pago en cuotas'] %>% hist
# row-wise proportions (margin = 1): within each payment type, share of rows with missing vs non-missing num_cuotas
prop.table(table(honorarios$tipo_pago, is.na(honorarios$num_cuotas)), 1)


### create short codebook: one row per column name found in any of the four files, plus a note of which
### files (if any) do NOT contain that column ("" when the column is present everywhere)
dfs <- list(planta = planta, contrata = contrata, codigo = codigo, honorarios = honorarios)

all_cols <- unique(unlist(lapply(dfs, names)))

codebook <- data.frame(
  column = all_cols,
  # vapply() (instead of sapply()) guarantees a character vector with one entry per column name,
  # whatever the input looks like
  missing = vapply(all_cols, function(col) {
    has_col <- vapply(dfs, function(x) col %in% names(x), logical(1))
    # paste() over an empty set yields "", i.e. the column is missing from none of the files
    paste(names(dfs)[!has_col], collapse = ",")
  }, character(1))
) %>% 
  arrange(column)


# time elapsed since the start of the reading section
print(Sys.time()-t0)
# pr(paste0(planta$anyo,'-', planta$mes))

# '  -----------------------------------------------------------------------------------------------------------------------------------------------
# MATCH COL-NAMES  -----------------------------------------------------------------------------------------------------------------------------------------
#

### Harmonise column names ahead of row-binding; planta/contrata (which share the same columns) serve as the template.
### any_of() makes each rename a no-op when the source column is absent.

## honorarios -> only gross pay is renamed. Columns deliberately left under their own names:
#   descripcion_funcion - keep different to planta/contrata -> what one is hired to do on this contract, mainly 'asesor'
#   tipo_pago - keep different too -> type of payment, which doesn't apply to standard planta/contrata?
#   desc_otrpago - keep different too -> type of payment (more detailed than tipo_pago), which doesn't apply to standard planta/contrata?
#   num_cuotas - also seems specific to temp. contracts, but don't know what it is -> some people have -1 -> change to NA?
# (disabled rename: report_pdf = any_of('fundesarrolladas'))
honorarios <- rename(
  honorarios,
  remuneracionbruta_mensual = any_of('remuneracionbruta')
)

## codigo -> align the overtime column. Columns deliberately left under their own names:
#   tipo_unidad_monetaria_remuneracion_monto_desvinculacion - keep different; related to type of contract - payment for desvinculacion (despido)
#   monto_desvin - keep different;  related to type of contract - payment for desvinculacion (despido)
# We have daily/nocturnal hours in planta/contrata -> keep this column for now and sum up to it when we add the two
codigo <- rename(
  codigo,
  horasextra = any_of('horas_extra')
)

### checks [names overlap]
# temp  = codigo[1:10^6, ]
# table(names(temp) %in% names(contrata))
# table(names(contrata) %in% names(temp))
# names(temp)[!(names(temp) %in% names(contrata))]

### CHOOSE: if run ANEW --------------------------------------------------------------------------------------------
# TRUE  -> always re-build and overwrite the combined file
# FALSE -> re-build only when the combined file does not exist yet
anew <- TRUE

# Path of the combined file (without extension), built once so that the existence check and the save
# below always refer to the same location.
# NOTE(review): assumes write_flex() appends '.<format1>' to the path it is given, as the Inputs/Outputs
# listed in the header suggest - confirm against its definition.
combined_path <- file.path(main_dir, 'data', paste0('raw_', format1), 'country_combined')



# '  -----------------------------------------------------------------------------------------------------------------------------------------------
# ROW-BIND FILES -------------------------------------------------------------------------------------------------------------------------------------
#

# '||' (scalar, short-circuit) is the correct operator inside if(); file.exists() is skipped when anew is TRUE
if (anew || !file.exists(paste0(combined_path, '.', format1))) {
  
  print('Binding')
  
  # country = rbindlist(list(planta, contrata, honorarios, codigo), fill = T)

  ### Combine (might not work when done all at once, that's why it is done dataset-by-dataset);
  ### each source table is removed right after it has been appended, to keep peak memory down.
  ### fill = TRUE pads columns missing from a given dataset with NA.
  
  print('Binding 1')
  country <- rbindlist(list(planta, contrata), fill = TRUE)
  rm(planta, contrata)
  gc()
  
  print('Binding 2')
  country <- rbindlist(list(country, honorarios), fill = TRUE)
  rm(honorarios)
  gc()
  
  print('Binding 3')
  country <- rbindlist(list(country, codigo), fill = TRUE)
  rm(codigo)
  gc()
  
  
  
  ### drop columns we don't need (at least for now) ---------------------------------------------------------------------------------------------------
  # NOTE: some names listed below might not be present in the updated payroll files, but were there at some point. any_of()
  # ignores the names that are not present, but removes the spare columns in case they are still/again in the files.
  
  # NOTE(2): See 'data/clean/country_codebook.xlsx' for clarification of what each column codes and why it is removed
  
  country <- country %>% dplyr::select(-any_of(c(
    'id_pagina', 'id_pagina_padre', 'camino', # website information
    'asignaciones', # combination of role designations, a lot of possible combinations, which I cannot disentangle into anything useful
    'observaciones', # only 1/3 have value (>10k unique ones) that is not 'sin observaciones'
    'descripcion_funcion', # 82% NA, non-missing contain some additional information on position/duties
    'num_cuotas', # 98.6% NAs
    'fundesarrolladas',
    'tipo_pago', # could be useful as it has info on intervals of pay, but is 82% NAs
    'enlace', # link to report, even if some useful info could be extracted (does NOT seem so), it is 99.4% 'No' observations
    'activado', # >99.99% 'si', all other just 'no'
    'temp' # if checks above created 'temp' column
  )))
  
  
  ### filter years: keep 2019-2024 (both inclusive) in a single pass, so the large table is subset only once;
  ### rows with missing 'anyo' are dropped, exactly as with two consecutive filters
  country <- country[anyo >= 2019 & anyo <= 2024]
  
  
  ### > save combined --------------------------------------------------------------------------------------------------------------------------------------
  print('Saving')
  gc()
  write_flex(country, combined_path, format = format1)
  


} else {
  
  # Nothing is done when the combined file already exists and anew is FALSE.
  # Read the existing file?
  # country = read_flex(combined_path, format = format1)
  
}


# Call the project helper exec_time_fun() (defined outside this script) with the label 'exec_time'
# NOTE(review): presumably records this script's run time in the 'exec_time.csv' mentioned in the header - confirm
exec_time_fun('exec_time')


# '  -----------------------------------------------------------------------------------------------------------------------------------------------
# FIN DEL CÓDIGO  ---------------------------------------------------------------------------------------------------------------------------------------------------------------
# 